In [1]:
import plotly
plotly.offline.init_notebook_mode()
import requests as r
import re
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import seaborn as sns
import time
import matplotlib.pyplot as plt
%matplotlib inline
# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set()
import warnings
# Silence every warning category for the rest of the session; exceptions still raise.
# NOTE(review): this also hides deprecation warnings emitted by the pandas/seaborn calls below.
warnings.simplefilter('ignore') #ignore the warnings, not the errors
In [2]:
# Read the two listing tables: df1 from 'pap350.csv', df2 from 'seloger_complet.csv'.
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ('pap350.csv', 'seloger_complet.csv')
)
In [3]:
# Tag every listing with its origin so the two tables can be told apart once
# combined: 'particulier' for df1, 'agence' for df2. df1 also loses its
# 'District' column so both tables share the same columns.
#
# drop()/assign() return new frames instead of mutating in place, and
# errors='ignore' keeps this cell re-runnable once 'District' is already gone
# (the in-place drop raised KeyError on a second run).
df1 = df1.drop(columns='District', errors='ignore').assign(Source='particulier')
df2 = df2.assign(Source='agence')
In [4]:
# Drop the index-like columns that came along with the CSV files.
# Reassigning (rather than inplace=True) and errors='ignore' make the cell
# safe to re-run: a second execution is a no-op instead of a KeyError.
df1 = df1.drop(columns=['Unnamed: 0', 'index'], errors='ignore')
df2 = df2.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')
In [5]:
# Stack both sources (df1 rows first, then df2) into one frame with a fresh
# 0..n-1 index. DataFrame.append was deprecated in pandas 1.4 and removed in
# 2.0; pd.concat is the supported replacement.
df = pd.concat([df1, df2], ignore_index=True)
In [6]:
# Display the combined frame (df1 rows followed by df2 rows).
df
Out[6]:
Price N_rooms Surface_m2 Link Zone Price_per_m2 Source
0 1250 1 29.0 https://www.pap.fr/annonces/appartement-paris-... ParisSud 43.103448 particulier
1 950 1 30.0 https://www.pap.fr/annonces/appartement-paris-... ParisSud 31.666667 particulier
2 1015 2 35.0 https://www.pap.fr/annonces/appartement-paris-... ParisSud 29.000000 particulier
3 870 1 16.0 https://www.pap.fr/annonces/appartement-paris-... ParisSud 54.375000 particulier
4 950 1 23.0 https://www.pap.fr/annonces/appartement-paris-... ParisSud 41.304348 particulier
... ... ... ... ... ... ... ...
2680 800 1 12.0 https://www.seloger.com/annonces/locations/app... ParisOuest 66.666667 agence
2681 1000 1 19.0 https://www.seloger.com/annonces/locations/app... ParisOuest 52.631579 agence
2682 1150 1 20.0 https://www.seloger.com/annonces/locations/app... ParisOuest 57.500000 agence
2683 900 1 17.0 https://www.seloger.com/annonces/locations/app... ParisOuest 52.941176 agence
2684 870 1 15.0 https://www.seloger.com/annonces/locations/app... ParisOuest 58.000000 agence

2685 rows × 7 columns

Charts

Pie chart

In [7]:
import plotly.express as px

# Donut chart with one slice per 'Source'; slices are sized by the
# 'Price_per_m2' values of that source.
fig = px.pie(
    df,
    names='Source',
    values='Price_per_m2',
    title='Lovely pie chart',
    height=400,
    hole=.3,
)
fig.update_traces(textinfo='percent+label', textposition='inside')
fig.show()

Dashboards by source

In [8]:
# dashboard DF1 PAP: 2x2 overview of the 'particulier' listings (df1)
#   [0,0] histogram of Price_per_m2      [0,1] boxplot of Surface_m2
#   [1,0] Price_per_m2 vs Surface_m2     [1,1] Price vs Surface_m2

_, axes = plt.subplots(2, 2, figsize=(18, 10))

axes[0, 0].set_title('Histogram', size=15)
axes[0, 1].set_title('Boxplot', size=15)
axes[1, 0].set_title('Price to m2 / Surface', size=15)
axes[1, 1].set_title('Price / Surface', size=15)

# The histogram shows Price_per_m2, so its x axis is labelled as a price
# (it was labelled 'Surface in m2').
axes[0, 0].set(xlabel='Price per m2', ylabel='Quantity')
axes[0, 1].set(ylabel='Surface in m2')
axes[1, 0].set(xlabel='Square meter price', ylabel='Surface in m2')
axes[1, 1].set(xlabel='Price in €', ylabel='Surface in m2')


axes[0, 0].hist(df1['Price_per_m2'], bins=20)
axes[0, 1].boxplot(df1['Surface_m2'])
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x=df1.Price_per_m2, y=df1.Surface_m2, ax=axes[1, 0])
sns.scatterplot(x=df1.Price, y=df1.Surface_m2, ax=axes[1, 1]);
In [34]:
# dashboard DF2 SELOGER: 2x2 overview of the 'agence' listings (df2)
#   [0,0] histogram of Price_per_m2      [0,1] boxplot of Surface_m2
#   [1,0] Price_per_m2 vs Surface_m2     [1,1] Price vs Surface_m2

_, axes = plt.subplots(2, 2, figsize=(18, 10))

axes[0, 0].set_title('Histogram', size=15)
axes[0, 1].set_title('Boxplot', size=15)
axes[1, 0].set_title('Price to m2 / Surface', size=15)
axes[1, 1].set_title('Price / Surface', size=15)

# The histogram shows Price_per_m2, so its x axis is labelled as a price
# (it was labelled 'Surface in m2').
axes[0, 0].set(xlabel='Price per m2', ylabel='Quantity')
axes[0, 1].set(ylabel='Surface in m2')
axes[1, 0].set(xlabel='Square meter price', ylabel='Surface in m2')
axes[1, 1].set(xlabel='Price in €', ylabel='Surface in m2')


axes[0, 0].hist(df2['Price_per_m2'], bins=20)
axes[0, 1].boxplot(df2['Surface_m2'])
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x=df2.Price_per_m2, y=df2.Surface_m2, ax=axes[1, 0])
sns.scatterplot(x=df2.Price, y=df2.Surface_m2, ax=axes[1, 1]);

# TODO: make markers smaller, or set axis limits on graphs 2, 3 and 4

Common dashboard

In [10]:
# Keep only the listings whose surface is strictly below 70 m2.
df_m2_lessthan_70 = df.loc[df['Surface_m2'] < 70]
In [11]:
# dashboard: 2x2 overview of all listings under 70 m2 (df_m2_lessthan_70)
#   [0,0] boxplot of Surface_m2          [0,1] histogram of Price_per_m2
#   [1,0] Price vs Surface_m2            [1,1] Price_per_m2 vs Surface_m2

_, axes = plt.subplots(2, 2, figsize=(18, 10))

axes[0, 1].set_title('Histogram', size=15)
axes[0, 0].set_title('Boxplot', size=15)
axes[1, 1].set_title('Price of m2 / Surface', size=15)
axes[1, 0].set_title('Price / Surface', size=15)

axes[0, 1].set(xlabel='Price per m2', ylabel='Quantity')
axes[0, 0].set(ylabel='Surface in m2')
axes[1, 1].set(xlabel='Square meter price', ylabel='Surface in m2')
axes[1, 0].set(xlabel='Price in €', ylabel='Surface in m2')


axes[0, 1].hist(df_m2_lessthan_70['Price_per_m2'], bins=20)
axes[0, 0].boxplot(df_m2_lessthan_70['Surface_m2'])
# Both coordinates come from the filtered frame. Taking x from the unfiltered
# df and y from the filtered one only worked through implicit index alignment.
# x/y are passed by keyword because seaborn >= 0.12 rejects positional vectors.
sns.scatterplot(x=df_m2_lessthan_70.Price_per_m2, y=df_m2_lessthan_70.Surface_m2, ax=axes[1, 1])
sns.scatterplot(x=df_m2_lessthan_70.Price, y=df_m2_lessthan_70.Surface_m2, ax=axes[1, 0]);

Answer

In [36]:
import plotly.express as px

# Price vs surface for every listing, one colour per Source value.
fig = px.scatter(
    x=df.Price,
    y=df.Surface_m2,
    color=df.Source,
    labels={"x": "Price in €",  "y": "Surface in m2"},
)

fig.show()
In [37]:
#interesting lines
In [38]:
import plotly.express as px

# Price per m2 vs surface, one colour per Source value.
fig = px.scatter(
    x=df.Price_per_m2,
    y=df.Surface_m2,
    color=df.Source,
    labels={"x": "Price of m2 in €",  "y": "Surface in m2"},
)
# Small, semi-transparent, thinly outlined markers so overlapping points stay readable.
fig.update_traces(
    marker={
        'size': 6,
        'opacity': 0.4,
        'line': {'width': 0.5, 'color': 'DarkSlateGrey'},
    },
    selector={'mode': 'markers'},
)
fig.show()
In [39]:
#boxplot answer
In [37]:
import plotly.express as px

# Price per m2 vs surface, one colour per Source value, with default marker styling.
fig = px.scatter(
    x=df.Price_per_m2,
    y=df.Surface_m2,
    color=df.Source,
    labels={"x": "Price of m2 in €",  "y": "Surface in m2"},
)

fig.show()
In [13]:
# Compare the two sources on their medians: price per m2 and surface.
median_agency = df.query('Source=="agence"').Price_per_m2.median()
median_pap = df.query('Source=="particulier"').Price_per_m2.median()

# Relative gap between the medians, expressed against the agency median
# (negative when individuals are the more expensive source).
overpay = round(((median_agency - median_pap) / median_agency), 2)
# round() instead of int(): int() truncates, so float error such as
# 0.29 * 100 == 28.999999999999996 would report 28% instead of 29%.
# The sign is flipped so the figure reads as "how much more individuals cost".
overpay_percent = -int(round(overpay * 100))

median_m2_agency = df.query('Source=="agence"').Surface_m2.median()
median_m2_pap = df.query('Source=="particulier"').Surface_m2.median()
# Whole square metres by which the agency median surface exceeds the individual one.
less_m2 = int(median_m2_agency - median_m2_pap)

print("Median price per m2 of agency in Paris is: "+str(median_agency))
print("Median price per m2 of individual in Paris is: "+str(median_pap))
print(" ")
print("By renting from a individual, you will overpay on average "+str(overpay_percent)+"% of the price")
print("And for "+ str(less_m2)+" less m2!")
Median price per m2 of agency in Paris is: 37.24137931034483
Median price per m2 of individual in Paris is: 42.32065217391305
 
By renting from a individual, you will overpay on average 14% of the price
And for 4 less m2!
In [43]:
import plotly.express as px

# Overlay both sources on one Price_per_m2 vs Surface_m2 scatter:
# df1 ('particulier') nearly opaque, df2 ('agence') faint underneath.
fig = px.scatter(x=df1.Price_per_m2, y=df1.Surface_m2, opacity=0.9,   labels={"x": "Price of m2 in €",  "y": "Surface in m2"})

# rgb() rather than rgba(): only three channels are given, and opacity is
# already set through px.scatter. Naming the trace and forcing its legend
# entry makes the two sources distinguishable in the figure.
fig.update_traces(marker_color='rgb(255,0,255)', name='particulier', showlegend=True)

fig2 = px.scatter(x=df2.Price_per_m2, y=df2.Surface_m2, opacity=0.1)

fig2.update_traces(marker_color='rgb(255,100,100)', name='agence', showlegend=True)

x = fig2.data[0] # the only trace of fig2: the df2 scatter

fig.add_trace(x)


fig.show()
In [44]:
import plotly.express as px

# Overlay both sources on one Price vs Surface_m2 scatter:
# df1 ('particulier') nearly opaque, df2 ('agence') faint underneath.
# The x axis carries the total price, so it is labelled "Price in €"
# (it was labelled "Price of m2 in €").
fig = px.scatter(x=df1.Price, y=df1.Surface_m2, opacity=0.9,   labels={"x": "Price in €",  "y": "Surface in m2"})

# rgb() rather than rgba(): only three channels are given, and opacity is
# already set through px.scatter. Naming the trace and forcing its legend
# entry makes the two sources distinguishable in the figure.
fig.update_traces(marker_color='rgb(255,0,255)', name='particulier', showlegend=True)

fig2 = px.scatter(x=df2.Price, y=df2.Surface_m2, opacity=0.1)

fig2.update_traces(marker_color='rgb(255,100,100)', name='agence', showlegend=True)

x = fig2.data[0] # the only trace of fig2: the df2 scatter

fig.add_trace(x)


fig.show()

Paris by zone

In [14]:
import plotly.express as px

# Price per m2 vs surface, one colour per Zone, with box plots in both margins.
fig = px.scatter(
    x=df.Price_per_m2,
    y=df.Surface_m2,
    color=df.Zone,
    marginal_x="box",
    marginal_y="box",
    labels={"x": "Price for m2 in €",  "y": "Surface in m2"},
)

fig.show()

Tool for my search

In [15]:
import webbrowser
import plotly.graph_objs as go
import plotly.express as px

# Interactive Price vs Surface scatter of every listing in df; clicking a point
# is meant to open that listing's URL.
fig = go.FigureWidget(layout={'hovermode': 'closest'})
# NOTE(review): fillcolor styles a trace's filled area; no fill is requested here,
# so it presumably has no visible effect on a markers-only trace - confirm.
scatter = fig.add_scatter(x=df.Price, y=df.Surface_m2, mode='markers',fillcolor='azure')
# Trace object held by the widget; the click callback is registered on it below.
data = fig.data[0]

# Throwaway express figure, built only to borrow its OLS trendline trace.
fig2 = px.scatter(x=df.Price, y=df.Surface_m2, trendline="ols")

trendline = fig2.data[1] # second trace, first one is scatter
fig.add_trace(trendline)

fig.update_xaxes(title_text="Price in €")
fig.update_yaxes(title_text="Surface in m2")

def do_click(trace, points, state):
    """Open the listing URL of the first clicked point in a new browser tab.

    `points.point_inds` holds positions within the clicked trace. The trace was
    built from df.Price / df.Surface_m2 in row order, so position `ind` is the
    same row as `df.Link.iloc[ind]`. Does nothing when no point was hit.
    """
    if points.point_inds:
        ind = points.point_inds[0]
        link = df.Link.iloc[ind]
        # NOTE(review): webbrowser runs in the kernel's Python process, so the tab
        # presumably opens on the machine hosting the kernel - confirm for remote setups.
        webbrowser.open_new_tab(link)

        
# Only the scatter trace gets the handler; the trendline trace has none.
data.on_click(do_click)
fig

Separated by color, but the PAP points do not open on click

In [84]:
import webbrowser
import plotly.graph_objs as go
import plotly.express as px

# Interactive Price vs Surface scatter with one colour per source, plus an OLS
# trendline over all listings. Clicking a point opens that listing's URL.
fig = go.FigureWidget(layout={'hovermode': 'closest'})
scatter = fig.add_scatter(x=df2.Price, y=df2.Surface_m2, mode='markers',fillcolor='azure', name='agence')
data = fig.data[0]

# rgb() rather than rgba(): only three channels are given.
fig.update_traces(marker_color='rgb(255,100,100)')

# Throwaway express figure, built only to borrow its OLS trendline trace.
fig2 = px.scatter(x=df.Price, y=df.Surface_m2, trendline="ols")

trendline = fig2.data[1] # second trace, first one is scatter
fig.add_trace(trendline)

fig3 = px.scatter(x=df1.Price, y=df1.Surface_m2)

fig3.update_traces(marker_color='rgb(255,0,0)', name='particulier', showlegend=True)

pap=fig3.data[0]
fig.add_trace(pap)
# Callbacks are registered on the trace object held by the widget, which is the
# last one in fig.data right after add_trace.
pap_trace = fig.data[-1]


fig.update_xaxes(title_text="Price in €")
fig.update_yaxes(title_text="Surface in m2")

# Each clickable trace is built from one source frame in row order, so a click
# position must be looked up in that same frame. Looking it up in the combined
# df opened the wrong listing, because df starts with the df1 rows.
links_by_trace = {'agence': df2.Link, 'particulier': df1.Link}

def do_click(trace, points, state):
    """Open the listing URL of the first clicked point in a new browser tab.

    The link column is chosen from `trace.name`, and `points.point_inds` gives
    the row position within that trace's source frame. Does nothing when no
    point was hit or the trace has no associated links (e.g. the trendline).
    """
    links = links_by_trace.get(trace.name)
    if links is not None and points.point_inds:
        ind = points.point_inds[0]
        link = links.iloc[ind]
        webbrowser.open_new_tab(link)


# Register on both scatter traces; previously only the 'agence' trace had a
# handler, so PAP points never opened.
data.on_click(do_click)
pap_trace.on_click(do_click)
fig
In [17]:
# change opacity of a boxplot graph to clearly see PAP points keeping boxplots
# on tool, separate pap and seloger by color (OR make on hover tell if it's particulier or agence)
# hypothesis testing
# clicking on the charts does not work